In [1]:
# Imports: polars for DataFrames, sqlite3 for the in-memory SQL store,
# plotly for interactive charts, missingno for missing-value visuals.
import polars as pl
import sqlite3
import plotly.express as px
import missingno as msno
# NOTE(review): removed the line `import using polars (something new...)` —
# it is not valid Python and would raise a SyntaxError on Restart & Run All.
In [2]:
# Load the raw Lightcast postings export into a polars DataFrame.
csv_path = "lightcast_job_postings.csv"
jobs = pl.read_csv(csv_path)
In [3]:
# Sanity check: (rows, columns) of the raw load — 72,476 rows x 131 columns.
print(jobs.shape)
(72476, 131)
In [4]:
# List every column name to orient ourselves in the 131-column schema.
print(jobs.columns)
['ID', 'LAST_UPDATED_DATE', 'LAST_UPDATED_TIMESTAMP', 'DUPLICATES', 'POSTED', 'EXPIRED', 'DURATION', 'SOURCE_TYPES', 'SOURCES', 'URL', 'ACTIVE_URLS', 'ACTIVE_SOURCES_INFO', 'TITLE_RAW', 'BODY', 'MODELED_EXPIRED', 'MODELED_DURATION', 'COMPANY', 'COMPANY_NAME', 'COMPANY_RAW', 'COMPANY_IS_STAFFING', 'EDUCATION_LEVELS', 'EDUCATION_LEVELS_NAME', 'MIN_EDULEVELS', 'MIN_EDULEVELS_NAME', 'MAX_EDULEVELS', 'MAX_EDULEVELS_NAME', 'EMPLOYMENT_TYPE', 'EMPLOYMENT_TYPE_NAME', 'MIN_YEARS_EXPERIENCE', 'MAX_YEARS_EXPERIENCE', 'IS_INTERNSHIP', 'SALARY', 'REMOTE_TYPE', 'REMOTE_TYPE_NAME', 'ORIGINAL_PAY_PERIOD', 'SALARY_TO', 'SALARY_FROM', 'LOCATION', 'CITY', 'CITY_NAME', 'COUNTY', 'COUNTY_NAME', 'MSA', 'MSA_NAME', 'STATE', 'STATE_NAME', 'COUNTY_OUTGOING', 'COUNTY_NAME_OUTGOING', 'COUNTY_INCOMING', 'COUNTY_NAME_INCOMING', 'MSA_OUTGOING', 'MSA_NAME_OUTGOING', 'MSA_INCOMING', 'MSA_NAME_INCOMING', 'NAICS2', 'NAICS2_NAME', 'NAICS3', 'NAICS3_NAME', 'NAICS4', 'NAICS4_NAME', 'NAICS5', 'NAICS5_NAME', 'NAICS6', 'NAICS6_NAME', 'TITLE', 'TITLE_NAME', 'TITLE_CLEAN', 'SKILLS', 'SKILLS_NAME', 'SPECIALIZED_SKILLS', 'SPECIALIZED_SKILLS_NAME', 'CERTIFICATIONS', 'CERTIFICATIONS_NAME', 'COMMON_SKILLS', 'COMMON_SKILLS_NAME', 'SOFTWARE_SKILLS', 'SOFTWARE_SKILLS_NAME', 'ONET', 'ONET_NAME', 'ONET_2019', 'ONET_2019_NAME', 'CIP6', 'CIP6_NAME', 'CIP4', 'CIP4_NAME', 'CIP2', 'CIP2_NAME', 'SOC_2021_2', 'SOC_2021_2_NAME', 'SOC_2021_3', 'SOC_2021_3_NAME', 'SOC_2021_4', 'SOC_2021_4_NAME', 'SOC_2021_5', 'SOC_2021_5_NAME', 'LOT_CAREER_AREA', 'LOT_CAREER_AREA_NAME', 'LOT_OCCUPATION', 'LOT_OCCUPATION_NAME', 'LOT_SPECIALIZED_OCCUPATION', 'LOT_SPECIALIZED_OCCUPATION_NAME', 'LOT_OCCUPATION_GROUP', 'LOT_OCCUPATION_GROUP_NAME', 'LOT_V6_SPECIALIZED_OCCUPATION', 'LOT_V6_SPECIALIZED_OCCUPATION_NAME', 'LOT_V6_OCCUPATION', 'LOT_V6_OCCUPATION_NAME', 'LOT_V6_OCCUPATION_GROUP', 'LOT_V6_OCCUPATION_GROUP_NAME', 'LOT_V6_CAREER_AREA', 'LOT_V6_CAREER_AREA_NAME', 'SOC_2', 'SOC_2_NAME', 'SOC_3', 'SOC_3_NAME', 'SOC_4', 'SOC_4_NAME', 'SOC_5', 
'SOC_5_NAME', 'LIGHTCAST_SECTORS', 'LIGHTCAST_SECTORS_NAME', 'NAICS_2022_2', 'NAICS_2022_2_NAME', 'NAICS_2022_3', 'NAICS_2022_3_NAME', 'NAICS_2022_4', 'NAICS_2022_4_NAME', 'NAICS_2022_5', 'NAICS_2022_5_NAME', 'NAICS_2022_6', 'NAICS_2022_6_NAME']
In [5]:
# Schema summary: one row per column with its dtype and distinct-value count.
# Compute all distinct counts in a SINGLE pass over the frame instead of the
# original list comprehension, which re-scanned all 72k rows once per column
# (131 full scans).
unique_counts = jobs.select(pl.all().n_unique()).row(0)
schema = pl.DataFrame({
    'Column': jobs.columns,
    'Data Type': jobs.dtypes,
    'Unique Values': list(unique_counts)
})
print(schema.shape)
(131, 3)
In [6]:
# Peek at the first 20 rows of the schema summary.
print(schema.head(20))
shape: (20, 3) ┌────────────────────────┬───────────┬───────────────┐ │ Column ┆ Data Type ┆ Unique Values │ │ --- ┆ --- ┆ --- │ │ str ┆ object ┆ i64 │ ╞════════════════════════╪═══════════╪═══════════════╡ │ ID ┆ String ┆ 72476 │ │ LAST_UPDATED_DATE ┆ String ┆ 169 │ │ LAST_UPDATED_TIMESTAMP ┆ String ┆ 174 │ │ DUPLICATES ┆ Int64 ┆ 72 │ │ POSTED ┆ String ┆ 153 │ │ … ┆ … ┆ … │ │ MODELED_DURATION ┆ Int64 ┆ 61 │ │ COMPANY ┆ Int64 ┆ 12303 │ │ COMPANY_NAME ┆ String ┆ 12302 │ │ COMPANY_RAW ┆ String ┆ 17214 │ │ COMPANY_IS_STAFFING ┆ Boolean ┆ 2 │ └────────────────────────┴───────────┴───────────────┘
Round 1 of data cleaning: drop identifier columns and redundant NAICS/SOC hierarchy levels.
In [7]:
# Round 1 cleanup: drop identifier/URL metadata and the intermediate
# NAICS / SOC hierarchy levels that won't be used (keeping SOC_2021_5 and
# NAICS_2022_6 as the most granular codes).
id_like_cols = ['ID', 'URL', 'ACTIVE_URLS', 'LAST_UPDATED_TIMESTAMP', 'DUPLICATES']
naics_legacy = [c for i in range(2, 7) for c in (f'NAICS{i}', f'NAICS{i}_NAME')]
soc_2021 = [c for i in range(2, 5) for c in (f'SOC_2021_{i}', f'SOC_2021_{i}_NAME')]
soc_plain = [c for i in range(2, 6) for c in (f'SOC_{i}', f'SOC_{i}_NAME')]
naics_2022 = [c for i in range(2, 6) for c in (f'NAICS_2022_{i}', f'NAICS_2022_{i}_NAME')]
columns_to_drop = id_like_cols + naics_legacy + soc_2021 + soc_plain + naics_2022
jobs_filtered = jobs.drop(columns_to_drop)
print(jobs_filtered.columns)
['LAST_UPDATED_DATE', 'POSTED', 'EXPIRED', 'DURATION', 'SOURCE_TYPES', 'SOURCES', 'ACTIVE_SOURCES_INFO', 'TITLE_RAW', 'BODY', 'MODELED_EXPIRED', 'MODELED_DURATION', 'COMPANY', 'COMPANY_NAME', 'COMPANY_RAW', 'COMPANY_IS_STAFFING', 'EDUCATION_LEVELS', 'EDUCATION_LEVELS_NAME', 'MIN_EDULEVELS', 'MIN_EDULEVELS_NAME', 'MAX_EDULEVELS', 'MAX_EDULEVELS_NAME', 'EMPLOYMENT_TYPE', 'EMPLOYMENT_TYPE_NAME', 'MIN_YEARS_EXPERIENCE', 'MAX_YEARS_EXPERIENCE', 'IS_INTERNSHIP', 'SALARY', 'REMOTE_TYPE', 'REMOTE_TYPE_NAME', 'ORIGINAL_PAY_PERIOD', 'SALARY_TO', 'SALARY_FROM', 'LOCATION', 'CITY', 'CITY_NAME', 'COUNTY', 'COUNTY_NAME', 'MSA', 'MSA_NAME', 'STATE', 'STATE_NAME', 'COUNTY_OUTGOING', 'COUNTY_NAME_OUTGOING', 'COUNTY_INCOMING', 'COUNTY_NAME_INCOMING', 'MSA_OUTGOING', 'MSA_NAME_OUTGOING', 'MSA_INCOMING', 'MSA_NAME_INCOMING', 'TITLE', 'TITLE_NAME', 'TITLE_CLEAN', 'SKILLS', 'SKILLS_NAME', 'SPECIALIZED_SKILLS', 'SPECIALIZED_SKILLS_NAME', 'CERTIFICATIONS', 'CERTIFICATIONS_NAME', 'COMMON_SKILLS', 'COMMON_SKILLS_NAME', 'SOFTWARE_SKILLS', 'SOFTWARE_SKILLS_NAME', 'ONET', 'ONET_NAME', 'ONET_2019', 'ONET_2019_NAME', 'CIP6', 'CIP6_NAME', 'CIP4', 'CIP4_NAME', 'CIP2', 'CIP2_NAME', 'SOC_2021_5', 'SOC_2021_5_NAME', 'LOT_CAREER_AREA', 'LOT_CAREER_AREA_NAME', 'LOT_OCCUPATION', 'LOT_OCCUPATION_NAME', 'LOT_SPECIALIZED_OCCUPATION', 'LOT_SPECIALIZED_OCCUPATION_NAME', 'LOT_OCCUPATION_GROUP', 'LOT_OCCUPATION_GROUP_NAME', 'LOT_V6_SPECIALIZED_OCCUPATION', 'LOT_V6_SPECIALIZED_OCCUPATION_NAME', 'LOT_V6_OCCUPATION', 'LOT_V6_OCCUPATION_NAME', 'LOT_V6_OCCUPATION_GROUP', 'LOT_V6_OCCUPATION_GROUP_NAME', 'LOT_V6_CAREER_AREA', 'LOT_V6_CAREER_AREA_NAME', 'LIGHTCAST_SECTORS', 'LIGHTCAST_SECTORS_NAME', 'NAICS_2022_6', 'NAICS_2022_6_NAME']
In [8]:
# Shape after round-1 drops: 131 - 37 = 94 columns remain.
jobs_filtered.shape
Out[8]:
(72476, 94)
In [9]:
# Per-column null counts, reshaped to long (Column, NaN Count) format so the
# counts can be filtered and plotted later.
nan_counts = jobs_filtered.null_count()
nan_counts_long = (
    nan_counts
    .unpivot()
    .rename({"variable": "Column", "value": "NaN Count"})
)
print(nan_counts_long.shape)
(94, 2)
In [10]:
# First 20 columns with their null counts.
print(nan_counts_long.head(20))
shape: (20, 2) ┌───────────────────────┬───────────┐ │ Column ┆ NaN Count │ │ --- ┆ --- │ │ str ┆ u32 │ ╞═══════════════════════╪═══════════╡ │ LAST_UPDATED_DATE ┆ 0 │ │ POSTED ┆ 0 │ │ EXPIRED ┆ 7822 │ │ DURATION ┆ 27294 │ │ SOURCE_TYPES ┆ 0 │ │ … ┆ … │ │ EDUCATION_LEVELS ┆ 0 │ │ EDUCATION_LEVELS_NAME ┆ 0 │ │ MIN_EDULEVELS ┆ 0 │ │ MIN_EDULEVELS_NAME ┆ 0 │ │ MAX_EDULEVELS ┆ 56155 │ └───────────────────────┴───────────┘
In [11]:
# testing the msno heatmap
# missingno works on pandas frames, so convert first; the heatmap shows
# nullity correlation between columns (which columns are missing together).
jobs_filtered_pd = jobs_filtered.to_pandas()
msno.heatmap(jobs_filtered_pd)
Out[11]:
<Axes: >
In [12]:
# Restrict to columns that actually contain nulls, and express each count as
# a percentage of total rows.
nan_counts = jobs_filtered.null_count()
total_rows = jobs_filtered.height
long_counts = nan_counts.unpivot().rename({"variable": "Column", "value": "NaN Count"})
nan_counts_filtered = long_counts.filter(pl.col("NaN Count") > 0).with_columns(
    (pl.col("NaN Count") / total_rows * 100).alias("NaN Percentage")
)
print(nan_counts_filtered.to_pandas())
Column NaN Count NaN Percentage 0 EXPIRED 7822 10.792538 1 DURATION 27294 37.659363 2 ACTIVE_SOURCES_INFO 64654 89.207462 3 TITLE_RAW 60 0.082786 4 MODELED_EXPIRED 15383 21.224957 5 MODELED_DURATION 19261 26.575694 6 COMPANY_RAW 497 0.685744 7 MAX_EDULEVELS 56155 77.480821 8 MAX_EDULEVELS_NAME 56155 77.480821 9 MIN_YEARS_EXPERIENCE 23113 31.890557 10 MAX_YEARS_EXPERIENCE 64046 88.368563 11 SALARY 41658 57.478338 12 ORIGINAL_PAY_PERIOD 40068 55.284508 13 SALARY_TO 40068 55.284508 14 SALARY_FROM 40068 55.284508 15 MSA 3908 5.392130 16 MSA_NAME 3908 5.392130 17 MSA_OUTGOING 3908 5.392130 18 MSA_NAME_OUTGOING 3908 5.392130 19 MSA_INCOMING 3921 5.410067 20 MSA_NAME_INCOMING 3921 5.410067 21 TITLE_CLEAN 96 0.132458 22 LIGHTCAST_SECTORS 54682 75.448424 23 LIGHTCAST_SECTORS_NAME 54682 75.448424
In [13]:
# Visualize: bar chart of NaN percentage per affected column.
nan_counts_filtered_pd = nan_counts_filtered.to_pandas()
fig = px.bar(
    nan_counts_filtered_pd,
    x="Column",
    y="NaN Percentage",
    color="NaN Percentage",
    color_continuous_scale="Viridis",
    title="NaN Percentages per Column",
)
fig.update_layout(width=1200, height=500, xaxis_tickangle=-45)
fig.show()
Round 2 of data cleaning: drop mostly-null columns and remove duplicate postings.
In [14]:
# Round 2: drop columns that are >= 50% NaN — except the salary fields
# (SALARY, SALARY_FROM, SALARY_TO), which are central to the analysis.
high_nan_cols = [
    "ACTIVE_SOURCES_INFO",
    "MAX_EDULEVELS",
    "MAX_EDULEVELS_NAME",
    "MAX_YEARS_EXPERIENCE",
    "ORIGINAL_PAY_PERIOD",
    "LIGHTCAST_SECTORS",
    "LIGHTCAST_SECTORS_NAME",
]
jobs_filtered_2 = jobs_filtered.drop(high_nan_cols)
In [15]:
# Shape after round-2 drops: 94 - 7 = 87 columns remain.
print(jobs_filtered_2.shape)
(72476, 87)
In [16]:
# and also remove duplicates
# A posting is considered a duplicate if it shares title, company, location,
# and posting date; keep the first occurrence. Re-running this cell is safe
# (unique of an already-unique frame is a no-op).
jobs_filtered_2 = jobs_filtered_2.unique(subset=["TITLE", "COMPANY", "LOCATION", "POSTED"], keep="first")
In [17]:
# Row count after deduplication (72,476 -> 69,200).
print(jobs_filtered_2.shape)
(69200, 87)
In [18]:
# Re-check null percentages after round-2 drops and deduplication.
# NOTE(review): this repeats cell 12 almost verbatim — a small helper
# function taking the frame as a parameter would avoid the copy-paste.
nan_counts = jobs_filtered_2.null_count()
total_rows = jobs_filtered_2.height
melted = nan_counts.unpivot().rename({"variable": "Column", "value": "NaN Count"})
nan_counts_filtered = melted.filter(pl.col("NaN Count") > 0).with_columns(
    (pl.col("NaN Count") / total_rows * 100).alias("NaN Percentage")
)
print(nan_counts_filtered.to_pandas())
Column NaN Count NaN Percentage 0 EXPIRED 7462 10.783237 1 DURATION 26092 37.705202 2 TITLE_RAW 54 0.078035 3 MODELED_EXPIRED 14739 21.299133 4 MODELED_DURATION 18401 26.591040 5 COMPANY_RAW 489 0.706647 6 MIN_YEARS_EXPERIENCE 22339 32.281792 7 SALARY 39954 57.736994 8 SALARY_TO 38490 55.621387 9 SALARY_FROM 38490 55.621387 10 MSA 2806 4.054913 11 MSA_NAME 2806 4.054913 12 MSA_OUTGOING 2806 4.054913 13 MSA_NAME_OUTGOING 2806 4.054913 14 MSA_INCOMING 2816 4.069364 15 MSA_NAME_INCOMING 2816 4.069364 16 TITLE_CLEAN 88 0.127168
In [19]:
# Insight into remote jobs: distribution of REMOTE_TYPE_NAME values,
# sorted by frequency.
remote_type_series = jobs_filtered_2.get_column("REMOTE_TYPE_NAME")
remote_distribution = remote_type_series.value_counts(sort=True)
print(remote_distribution)
shape: (4, 2) ┌──────────────────┬───────┐ │ REMOTE_TYPE_NAME ┆ count │ │ --- ┆ --- │ │ str ┆ u32 │ ╞══════════════════╪═══════╡ │ [None] ┆ 54211 │ │ Remote ┆ 11745 │ │ Hybrid Remote ┆ 2151 │ │ Not Remote ┆ 1093 │ └──────────────────┴───────┘
Next, load the cleaned data into an in-memory SQLite database for SQL-based analysis.
In [20]:
# Mirror the cleaned frame into an in-memory SQLite table for SQL analysis.
conn = sqlite3.connect(':memory:')
conn.execute("DROP TABLE IF EXISTS jobs;")
# Quote identifiers so unusual column names can never break the generated DDL.
columns = ", ".join([f'"{col}" TEXT' for col in jobs_filtered_2.columns])
create_table_query = f"CREATE TABLE jobs ({columns});"
conn.execute(create_table_query)
insert_query = f"INSERT INTO jobs VALUES ({', '.join(['?'] * len(jobs_filtered_2.columns))})"
# iter_rows() yields Python-native tuples directly (nulls stay None -> SQL
# NULL), avoiding the original to_numpy().tolist() detour through a numpy
# object array.
conn.executemany(insert_query, jobs_filtered_2.iter_rows())
conn.commit()
print("Data from jobs_filtered_2 has been successfully inserted into the SQLite database.")
Data from jobs_filtered_2 has been successfully inserted into the SQLite database.
In [21]:
# Sanity check: the SQLite row count should match the deduplicated frame.
query_count = """
SELECT COUNT(*) AS TotalCount
FROM jobs;
"""
cursor = conn.cursor()
cursor.execute(query_count)
result = cursor.fetchall()
# orient="row" matches the row tuples returned by fetchall(); it is also
# consistent with the later query cells and avoids orientation inference.
total_count = pl.DataFrame(result, schema=["TotalCount"], orient="row")
print(total_count)
shape: (1, 1) ┌────────────┐ │ TotalCount │ │ --- │ │ i64 │ ╞════════════╡ │ 69200 │ └────────────┘
Analysis by industry (NAICS_2022_6_NAME)
In [22]:
# Number of distinct 6-digit NAICS industry names in the cleaned data.
# .item() extracts the scalar directly — simpler than the original
# .to_numpy()[0][0] round-trip through a numpy array.
unique_naics_count = jobs_filtered_2.select(pl.col("NAICS_2022_6_NAME").n_unique()).item()
print(f"Unique NAICS_2022_6_NAME count: {unique_naics_count}")
Unique NAICS_2022_6_NAME count: 814
In [23]:
# We're interested in the finance, marketing, and transportation industries.
query_filtered_naics_count = """
SELECT COUNT(DISTINCT NAICS_2022_6_NAME) AS Unique_NAICS_Count
FROM jobs
WHERE NAICS_2022_6_NAME LIKE '%finance%'
OR NAICS_2022_6_NAME LIKE '%banking%'
OR NAICS_2022_6_NAME LIKE '%investing%'
OR NAICS_2022_6_NAME LIKE '%investments%'
OR NAICS_2022_6_NAME LIKE '%marketing%'
OR NAICS_2022_6_NAME LIKE '%transportation%';
"""
# Connection.execute() creates and returns a cursor, so no explicit
# conn.cursor() call is needed.
result = conn.execute(query_filtered_naics_count).fetchall()
unique_filtered_naics_count_df = pl.DataFrame(result, schema=["Unique_NAICS_Count"])
print(unique_filtered_naics_count_df)
shape: (1, 1) ┌────────────────────┐ │ Unique_NAICS_Count │ │ --- │ │ i64 │ ╞════════════════════╡ │ 28 │ └────────────────────┘
In [24]:
# List the 28 matching industry names (widen the table display to fit them).
pl.Config.set_tbl_rows(28)
query_filtered_naics = """
SELECT DISTINCT NAICS_2022_6_NAME
FROM jobs
WHERE NAICS_2022_6_NAME LIKE '%finance%'
OR NAICS_2022_6_NAME LIKE '%banking%'
OR NAICS_2022_6_NAME LIKE '%investing%'
OR NAICS_2022_6_NAME LIKE '%investments%'
OR NAICS_2022_6_NAME LIKE '%marketing%'
OR NAICS_2022_6_NAME LIKE '%transportation%';
"""
result = conn.execute(query_filtered_naics).fetchall()
filtered_naics_df = pl.DataFrame(result, schema=["NAICS_2022_6_NAME"], orient="row")
print(filtered_naics_df.shape)
(28, 1)
In [25]:
print(filtered_naics_df)
shape: (28, 1) ┌─────────────────────────────────┐ │ NAICS_2022_6_NAME │ │ --- │ │ str │ ╞═════════════════════════════════╡ │ Marketing Consulting Services │ │ Commercial Banking │ │ Scheduled Passenger Air Transp… │ │ Telemarketing Bureaus and Othe… │ │ Investment Banking and Securit… │ │ Regulation of Agricultural Mar… │ │ Freight Transportation Arrange… │ │ Special Needs Transportation │ │ Public Finance Activities │ │ Marketing Research and Public … │ │ Nonscheduled Chartered Passeng… │ │ Nonscheduled Chartered Freight… │ │ School and Employee Bus Transp… │ │ Regulation and Administration … │ │ Scenic and Sightseeing Transpo… │ │ Other Support Activities for A… │ │ Pipeline Transportation of Nat… │ │ Pipeline Transportation of Cru… │ │ All Other Transportation Equip… │ │ Deep Sea Freight Transportatio… │ │ Scheduled Freight Air Transpor… │ │ All Other Pipeline Transportat… │ │ Inland Water Freight Transport… │ │ Other Support Activities for R… │ │ All Other Transit and Ground P… │ │ Pipeline Transportation of Ref… │ │ Support Activities for Rail Tr… │ │ Transportation Equipment and S… │ └─────────────────────────────────┘
In [26]:
# Average salaries per matching industry; rows without full salary info
# (SALARY, SALARY_FROM, SALARY_TO) are excluded, which drops industries
# that have no salary data at all.
query_avg_salary_by_filtered_naics = """
SELECT
NAICS_2022_6_NAME,
COUNT(NAICS_2022_6_NAME) AS Occurrences,
AVG(SALARY) AS Avg_Salary,
AVG(SALARY_FROM) AS Avg_Salary_From,
AVG(SALARY_TO) AS Avg_Salary_To
FROM jobs
WHERE SALARY IS NOT NULL AND SALARY_TO IS NOT NULL AND SALARY_FROM IS NOT NULL
AND (NAICS_2022_6_NAME LIKE '%finance%'
OR NAICS_2022_6_NAME LIKE '%banking%'
OR NAICS_2022_6_NAME LIKE '%investing%'
OR NAICS_2022_6_NAME LIKE '%investments%'
OR NAICS_2022_6_NAME LIKE '%marketing%'
OR NAICS_2022_6_NAME LIKE '%transportation%')
GROUP BY NAICS_2022_6_NAME
ORDER BY Occurrences DESC;
"""
result = conn.execute(query_avg_salary_by_filtered_naics).fetchall()
pl.Config.set_tbl_rows(28)
pl.Config.set_tbl_cols(None)
salary_schema = ["NAICS_2022_6_NAME", "Occurrences", "Avg_Salary", "Avg_Salary_From", "Avg_Salary_To"]
avg_salary_by_filtered_naics_df = pl.DataFrame(result, schema=salary_schema, orient="row")
print(avg_salary_by_filtered_naics_df.shape)
(23, 5)
In [27]:
# NOTE: drops 5
# (28 matching industries minus 23 with salary data = 5 industries
# excluded for lacking any salary information.)
print(avg_salary_by_filtered_naics_df)
shape: (23, 5) ┌─────────────────────────────────┬─────────────┬───────────────┬─────────────────┬───────────────┐ │ NAICS_2022_6_NAME ┆ Occurrences ┆ Avg_Salary ┆ Avg_Salary_From ┆ Avg_Salary_To │ │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ │ str ┆ i64 ┆ f64 ┆ f64 ┆ f64 │ ╞═════════════════════════════════╪═════════════╪═══════════════╪═════════════════╪═══════════════╡ │ Commercial Banking ┆ 922 ┆ 127314.572668 ┆ 103130.91757 ┆ 151398.733189 │ │ Telemarketing Bureaus and Othe… ┆ 201 ┆ 90259.129353 ┆ 72430.199005 ┆ 107765.024876 │ │ Investment Banking and Securit… ┆ 94 ┆ 120806.776596 ┆ 95981.223404 ┆ 144876.382979 │ │ Marketing Consulting Services ┆ 48 ┆ 79475.604167 ┆ 69490.916667 ┆ 87989.833333 │ │ Regulation and Administration … ┆ 28 ┆ 93925.928571 ┆ 76147.75 ┆ 111066.857143 │ │ Scheduled Passenger Air Transp… ┆ 18 ┆ 121229.555556 ┆ 105449.777778 ┆ 137009.444444 │ │ Marketing Research and Public … ┆ 14 ┆ 90237.857143 ┆ 73339.285714 ┆ 107136.428571 │ │ Public Finance Activities ┆ 13 ┆ 108434.923077 ┆ 85725.0 ┆ 131145.153846 │ │ Regulation of Agricultural Mar… ┆ 11 ┆ 85541.909091 ┆ 73135.454545 ┆ 97948.636364 │ │ Freight Transportation Arrange… ┆ 10 ┆ 79420.3 ┆ 70311.1 ┆ 88529.8 │ │ Other Support Activities for A… ┆ 6 ┆ 82500.0 ┆ 71333.333333 ┆ 93666.666667 │ │ Nonscheduled Chartered Freight… ┆ 6 ┆ 108785.333333 ┆ 91843.166667 ┆ 125727.666667 │ │ Pipeline Transportation of Nat… ┆ 3 ┆ 98722.666667 ┆ 76333.333333 ┆ 120000.0 │ │ Special Needs Transportation ┆ 2 ┆ 62565.5 ┆ 57566.0 ┆ 65286.5 │ │ School and Employee Bus Transp… ┆ 2 ┆ 62650.0 ┆ 61090.0 ┆ 64210.0 │ │ Scenic and Sightseeing Transpo… ┆ 2 ┆ 80069.5 ┆ 67591.0 ┆ 92548.0 │ │ All Other Transit and Ground P… ┆ 2 ┆ 90675.0 ┆ 77500.0 ┆ 103850.0 │ │ Support Activities for Rail Tr… ┆ 1 ┆ 150000.0 ┆ 140000.0 ┆ 160000.0 │ │ Scheduled Freight Air Transpor… ┆ 1 ┆ 62400.0 ┆ 62400.0 ┆ 62400.0 │ │ Other Support Activities for R… ┆ 1 ┆ 55000.0 ┆ 50000.0 ┆ 60000.0 │ │ Nonscheduled Chartered Passeng… ┆ 1 ┆ 47008.0 ┆ 47008.0 ┆ 47008.0 │ 
│ Deep Sea Freight Transportatio… ┆ 1 ┆ 131400.0 ┆ 111300.0 ┆ 151500.0 │ │ All Other Transportation Equip… ┆ 1 ┆ 89523.0 ┆ 73299.0 ┆ 105747.0 │ └─────────────────────────────────┴─────────────┴───────────────┴─────────────────┴───────────────┘
Analysis by job title (TITLE_NAME)
In [28]:
# Number of distinct job-title names in the cleaned data.
# .item() extracts the scalar directly — simpler than the original
# .to_numpy()[0][0] round-trip through a numpy array.
unique_title_count = jobs_filtered_2.select(pl.col("TITLE_NAME").n_unique()).item()
print(f"Unique TITLE_NAME count: {unique_title_count}")
Unique TITLE_NAME count: 5720
In [29]:
# We're interested in analyst positions.
pl.Config.set_tbl_rows(5000)
query_filtered_title = """
SELECT DISTINCT TITLE_NAME
FROM jobs
WHERE TITLE_NAME LIKE '%analyst%'
OR TITLE_NAME LIKE '%analytics%'
OR TITLE_NAME LIKE '%data%';
"""
result = conn.execute(query_filtered_title).fetchall()
filtered_title_df = pl.DataFrame(result, schema=["TITLE_NAME"], orient="row")
print(filtered_title_df.shape)
(1476, 1)
In [30]:
# Sample of the 1,476 matching analyst/data title names.
print(filtered_title_df.head(20))
shape: (20, 1) ┌─────────────────────────────────┐ │ TITLE_NAME │ │ --- │ │ str │ ╞═════════════════════════════════╡ │ Data Analytics Interns │ │ Technical Data Analysts │ │ Data Analysts │ │ Transformation Analysts │ │ Data Migration Analysts │ │ Analytics Engineers │ │ Threat Intelligence Analysts │ │ Regulatory Analysts │ │ Lead Data Analysts │ │ IT Data Analysts │ │ Insider Threat Analysts │ │ SAP Business Analysts │ │ Process Analysts │ │ IM Analysts │ │ Principal Data Scientists │ │ SAP SD Analysts │ │ Business Intelligence Reportin… │ │ Application Analysts │ │ Workforce Intelligence Analyst… │ │ Salesforce Business Systems An… │ └─────────────────────────────────┘
In [31]:
# Average salaries per matching title; rows without full salary info are
# excluded, which drops titles that have no salary data at all.
query_avg_salary_by_filtered_title = """
SELECT
TITLE_NAME,
COUNT(TITLE_NAME) AS Occurrences,
AVG(SALARY) AS Avg_Salary,
AVG(SALARY_FROM) AS Avg_Salary_From,
AVG(SALARY_TO) AS Avg_Salary_To
FROM jobs
WHERE SALARY IS NOT NULL AND SALARY_TO IS NOT NULL AND SALARY_FROM IS NOT NULL
AND (TITLE_NAME LIKE '%analyst%'
OR TITLE_NAME LIKE '%analytics%'
OR TITLE_NAME LIKE '%data%')
GROUP BY TITLE_NAME
ORDER BY Occurrences DESC;
"""
result = conn.execute(query_avg_salary_by_filtered_title).fetchall()
pl.Config.set_tbl_rows(1500)
pl.Config.set_tbl_cols(None)
title_schema = ["TITLE_NAME", "Occurrences", "Avg_Salary", "Avg_Salary_From", "Avg_Salary_To"]
avg_salary_by_filtered_title_df = pl.DataFrame(result, schema=title_schema, orient="row")
print(avg_salary_by_filtered_title_df.shape)
(1005, 5)
In [32]:
# NOTE: drops 471
# (1,476 matching titles minus 1,005 with salary data = 471 titles
# excluded for lacking any salary information.)
print(avg_salary_by_filtered_title_df.head(20))
shape: (20, 5) ┌─────────────────────────────────┬─────────────┬───────────────┬─────────────────┬───────────────┐ │ TITLE_NAME ┆ Occurrences ┆ Avg_Salary ┆ Avg_Salary_From ┆ Avg_Salary_To │ │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ │ str ┆ i64 ┆ f64 ┆ f64 ┆ f64 │ ╞═════════════════════════════════╪═════════════╪═══════════════╪═════════════════╪═══════════════╡ │ Data Analysts ┆ 3467 ┆ 93825.56533 ┆ 81035.9152 ┆ 106047.649841 │ │ Business Intelligence Analysts ┆ 938 ┆ 103024.716418 ┆ 87234.638593 ┆ 118629.098081 │ │ Data Analytics Engineers ┆ 325 ┆ 185512.461538 ┆ 161054.243077 ┆ 209111.68 │ │ Data and Reporting Analysts ┆ 290 ┆ 79633.92069 ┆ 66126.458621 ┆ 91923.089655 │ │ Data Governance Analysts ┆ 228 ┆ 111827.557018 ┆ 94360.548246 ┆ 128478.942982 │ │ Data Quality Analysts ┆ 193 ┆ 104164.165803 ┆ 91157.621762 ┆ 116710.108808 │ │ Data Analytics Analysts ┆ 192 ┆ 109418.296875 ┆ 87121.588542 ┆ 131655.473958 │ │ Data Management Analysts ┆ 178 ┆ 107976.398876 ┆ 92009.02809 ┆ 123313.651685 │ │ Data Modelers ┆ 172 ┆ 136975.895349 ┆ 118876.593023 ┆ 155075.238372 │ │ Lead Data Analysts ┆ 170 ┆ 111721.929412 ┆ 95335.964706 ┆ 127742.011765 │ │ Research Data Analysts ┆ 166 ┆ 81899.710843 ┆ 69225.656627 ┆ 94308.771084 │ │ IT Data Analytics Analysts ┆ 164 ┆ 107843.939024 ┆ 84157.25 ┆ 131339.04878 │ │ Lead Business Intelligence Ana… ┆ 151 ┆ 120744.960265 ┆ 100244.437086 ┆ 140527.245033 │ │ Data Science Analysts ┆ 133 ┆ 114863.992481 ┆ 96807.609023 ┆ 132389.849624 │ │ Data Analytics Leads ┆ 120 ┆ 162420.35 ┆ 120716.908333 ┆ 203772.808333 │ │ Data Operations Analysts ┆ 114 ┆ 88823.359649 ┆ 73918.745614 ┆ 102379.657895 │ │ Business Intelligence Data Ana… ┆ 113 ┆ 103099.938053 ┆ 87012.858407 ┆ 119073.752212 │ │ Health Data Analysts ┆ 107 ┆ 91458.654206 ┆ 74872.214953 ┆ 107990.401869 │ │ Data and Analytics Consultants ┆ 107 ┆ 127020.224299 ┆ 94460.691589 ┆ 158536.672897 │ │ Enterprise Data Architects ┆ 106 ┆ 166127.603774 ┆ 146011.613208 ┆ 186243.622642 │ 
└─────────────────────────────────┴─────────────┴───────────────┴─────────────────┴───────────────┘
Now combine both filters — analyst titles within the selected industries — in a single query.
In [33]:
# Analyst positions within the industries we're interested in: combine the
# title filter and the industry filter in one grouped query.
query_avg_salary_by_filtered_title_naics = """
SELECT
TITLE_NAME,
NAICS_2022_6_NAME,
COUNT(TITLE_NAME) AS Occurrences,
AVG(SALARY) AS Avg_Salary,
AVG(SALARY_FROM) AS Avg_Salary_From,
AVG(SALARY_TO) AS Avg_Salary_To
FROM jobs
WHERE SALARY IS NOT NULL AND SALARY_TO IS NOT NULL AND SALARY_FROM IS NOT NULL
AND (TITLE_NAME LIKE '%analyst%'
OR TITLE_NAME LIKE '%analytics%'
OR TITLE_NAME LIKE '%data%')
AND (NAICS_2022_6_NAME LIKE '%finance%'
OR NAICS_2022_6_NAME LIKE '%banking%'
OR NAICS_2022_6_NAME LIKE '%investing%'
OR NAICS_2022_6_NAME LIKE '%investments%'
OR NAICS_2022_6_NAME LIKE '%marketing%'
OR NAICS_2022_6_NAME LIKE '%transportation%')
GROUP BY TITLE_NAME, NAICS_2022_6_NAME
ORDER BY Occurrences DESC;
"""
# Keep `result` assigned at module level — a later cell reads it.
result = conn.execute(query_avg_salary_by_filtered_title_naics).fetchall()
pl.Config.set_tbl_rows(250)
pl.Config.set_tbl_cols(None)
combined_schema = ["TITLE_NAME", "NAICS_2022_6_NAME", "Occurrences", "Avg_Salary", "Avg_Salary_From", "Avg_Salary_To"]
avg_salary_by_filtered_title_naics_df = pl.DataFrame(result, schema=combined_schema, orient="row")
print(avg_salary_by_filtered_title_naics_df.shape)
(244, 6)
In [34]:
# 244 remaining
# (244 title/industry pairs have complete salary information.)
print(avg_salary_by_filtered_title_naics_df.head(20))
shape: (20, 6) ┌─────────────────┬─────────────────┬─────────────┬───────────────┬────────────────┬───────────────┐ │ TITLE_NAME ┆ NAICS_2022_6_NA ┆ Occurrences ┆ Avg_Salary ┆ Avg_Salary_Fro ┆ Avg_Salary_To │ │ --- ┆ ME ┆ --- ┆ --- ┆ m ┆ --- │ │ str ┆ --- ┆ i64 ┆ f64 ┆ --- ┆ f64 │ │ ┆ str ┆ ┆ ┆ f64 ┆ │ ╞═════════════════╪═════════════════╪═════════════╪═══════════════╪════════════════╪═══════════════╡ │ Data and ┆ Telemarketing ┆ 65 ┆ 81366.523077 ┆ 64625.384615 ┆ 97198.0 │ │ Reporting ┆ Bureaus and ┆ ┆ ┆ ┆ │ │ Analysts ┆ Othe… ┆ ┆ ┆ ┆ │ │ Health Data ┆ Telemarketing ┆ 51 ┆ 94717.647059 ┆ 75764.705882 ┆ 113670.588235 │ │ Analysts ┆ Bureaus and ┆ ┆ ┆ ┆ │ │ ┆ Othe… ┆ ┆ ┆ ┆ │ │ Data ┆ Commercial ┆ 41 ┆ 172009.756098 ┆ 137607.804878 ┆ 206411.707317 │ │ Integration ┆ Banking ┆ ┆ ┆ ┆ │ │ Leads ┆ ┆ ┆ ┆ ┆ │ │ Data Governance ┆ Commercial ┆ 35 ┆ 129052.342857 ┆ 103377.371429 ┆ 154727.314286 │ │ Analysts ┆ Banking ┆ ┆ ┆ ┆ │ │ Data Analysts ┆ Commercial ┆ 31 ┆ 105933.612903 ┆ 87030.903226 ┆ 124547.258065 │ │ ┆ Banking ┆ ┆ ┆ ┆ │ │ Crime Analysts ┆ Telemarketing ┆ 25 ┆ 46635.28 ┆ 44214.4 ┆ 48977.6 │ │ ┆ Bureaus and ┆ ┆ ┆ ┆ │ │ ┆ Othe… ┆ ┆ ┆ ┆ │ │ Lead Business ┆ Commercial ┆ 23 ┆ 113100.0 ┆ 86840.0 ┆ 139360.0 │ │ Intelligence ┆ Banking ┆ ┆ ┆ ┆ │ │ Ana… ┆ ┆ ┆ ┆ ┆ │ │ Data Management ┆ Commercial ┆ 20 ┆ 159859.65 ┆ 124353.0 ┆ 195366.3 │ │ Analysts ┆ Banking ┆ ┆ ┆ ┆ │ │ Business ┆ Commercial ┆ 19 ┆ 109138.526316 ┆ 88355.842105 ┆ 129921.210526 │ │ Intelligence ┆ Banking ┆ ┆ ┆ ┆ │ │ Analysts ┆ ┆ ┆ ┆ ┆ │ │ Data Quality ┆ Commercial ┆ 17 ┆ 107957.117647 ┆ 85142.588235 ┆ 130771.705882 │ │ Analysts ┆ Banking ┆ ┆ ┆ ┆ │ │ Data Analytics ┆ Commercial ┆ 16 ┆ 167973.3125 ┆ 134563.0 ┆ 201383.625 │ │ Leads ┆ Banking ┆ ┆ ┆ ┆ │ │ Enterprise Risk ┆ Commercial ┆ 16 ┆ 108521.25 ┆ 81160.4375 ┆ 135528.6875 │ │ Analysts ┆ Banking ┆ ┆ ┆ ┆ │ │ Data Analysts ┆ Regulation and ┆ 14 ┆ 93407.285714 ┆ 73797.285714 ┆ 113018.285714 │ │ ┆ Administration ┆ ┆ ┆ ┆ │ │ ┆ … ┆ ┆ ┆ ┆ │ │ Data Warehouse ┆ Commercial ┆ 
14 ┆ 86984.714286 ┆ 66977.0 ┆ 106312.0 │ │ Business ┆ Banking ┆ ┆ ┆ ┆ │ │ Analys… ┆ ┆ ┆ ┆ ┆ │ │ Data Quality ┆ Commercial ┆ 13 ┆ 156542.230769 ┆ 129509.615385 ┆ 183574.846154 │ │ Leads ┆ Banking ┆ ┆ ┆ ┆ │ │ Business ┆ Investment ┆ 12 ┆ 125392.5 ┆ 92790.75 ┆ 157995.0 │ │ Intelligence ┆ Banking and ┆ ┆ ┆ ┆ │ │ Analysts ┆ Securit… ┆ ┆ ┆ ┆ │ │ Business ┆ Telemarketing ┆ 11 ┆ 107000.0 ┆ 84927.272727 ┆ 129072.727273 │ │ Intelligence ┆ Bureaus and ┆ ┆ ┆ ┆ │ │ Analysts ┆ Othe… ┆ ┆ ┆ ┆ │ │ Data Analysts ┆ Investment ┆ 10 ┆ 82393.0 ┆ 70166.0 ┆ 94620.0 │ │ ┆ Banking and ┆ ┆ ┆ ┆ │ │ ┆ Securit… ┆ ┆ ┆ ┆ │ │ Data Analytics ┆ Commercial ┆ 10 ┆ 101205.0 ┆ 88850.8 ┆ 113559.2 │ │ Analysts ┆ Banking ┆ ┆ ┆ ┆ │ │ Data ┆ Commercial ┆ 10 ┆ 115110.0 ┆ 92377.0 ┆ 137843.0 │ │ Integration ┆ Banking ┆ ┆ ┆ ┆ │ │ Analysts ┆ ┆ ┆ ┆ ┆ │ └─────────────────┴─────────────────┴─────────────┴───────────────┴────────────────┴───────────────┘
plot 1
In [35]:
# Plot 1 — top 20 title/industry pairs by posting count.
# NOTE(review): the original rebuilt avg_salary_by_filtered_title_naics_df
# from `result` here, duplicating the previous cell; the DataFrame already
# exists, so just convert it for plotly.
avg_salary_by_filtered_title_naics_df_pd = avg_salary_by_filtered_title_naics_df.to_pandas()
top_20_df = avg_salary_by_filtered_title_naics_df_pd.head(20)
fig = px.bar(
    top_20_df,
    x='TITLE_NAME',
    y='Occurrences',
    color='NAICS_2022_6_NAME',
    title="Top 20 Job Titles by Occurrences and their Industry with Applied Filters",
    labels={'TITLE_NAME': 'Job Title', 'Occurrences': 'Number of Occurrences'},
    hover_data=['Avg_Salary', 'Avg_Salary_From', 'Avg_Salary_To'],
)
fig.update_layout(xaxis_tickangle=45)
fig.show()
plot 2
In [36]:
# Plot 2 — top 7 states by matching postings.
# NOTE(review): the original query selected bare TITLE_NAME and
# NAICS_2022_6_NAME while grouping only by STATE_NAME; SQLite silently
# returns an arbitrary row's value for such bare columns. Select only the
# grouped column and aggregates so every output column is well-defined.
query_avg_salary_by_filtered_title_naics_state = """
SELECT
STATE_NAME,
COUNT(TITLE_NAME) AS Occurrences,
AVG(SALARY) AS Avg_Salary,
AVG(SALARY_FROM) AS Avg_Salary_From,
AVG(SALARY_TO) AS Avg_Salary_To
FROM jobs
WHERE SALARY IS NOT NULL AND SALARY_TO IS NOT NULL AND SALARY_FROM IS NOT NULL
AND (TITLE_NAME LIKE '%analyst%'
OR TITLE_NAME LIKE '%analytics%'
OR TITLE_NAME LIKE '%data%')
AND (NAICS_2022_6_NAME LIKE '%finance%'
OR NAICS_2022_6_NAME LIKE '%banking%'
OR NAICS_2022_6_NAME LIKE '%investing%'
OR NAICS_2022_6_NAME LIKE '%investments%'
OR NAICS_2022_6_NAME LIKE '%marketing%'
OR NAICS_2022_6_NAME LIKE '%transportation%')
GROUP BY STATE_NAME
ORDER BY Occurrences DESC
LIMIT 7;
"""
cursor = conn.cursor()
cursor.execute(query_avg_salary_by_filtered_title_naics_state)
state_rows = cursor.fetchall()
pl.Config.set_tbl_rows(250)
pl.Config.set_tbl_cols(None)
avg_salary_by_filtered_title_naics_state_df = pl.DataFrame(
    state_rows,
    schema=["STATE_NAME", "Occurrences", "Avg_Salary", "Avg_Salary_From", "Avg_Salary_To"],
    orient="row",
)
avg_salary_by_filtered_title_naics_state_df_pd = avg_salary_by_filtered_title_naics_state_df.to_pandas()
fig = px.pie(
    avg_salary_by_filtered_title_naics_state_df_pd,
    names='STATE_NAME',
    values='Occurrences',
    title="Top 7 States by Occurrences with Applied Filters",
    color='STATE_NAME',
)
fig.show()
plot 3
In [37]:
# Plot 3 — treemap of analyst positions nested within their industries;
# tile size = posting count, tile color = average salary.
avg_salary_by_filtered_title_naics_df_pd = avg_salary_by_filtered_title_naics_df.to_pandas()
fig = px.treemap(
    avg_salary_by_filtered_title_naics_df_pd,
    path=['NAICS_2022_6_NAME', 'TITLE_NAME'],
    values='Occurrences',
    color='Avg_Salary',
    color_continuous_scale='Viridis',
    title='Analyst Positions within Selected Industries with Recorded Salary Information',
)
colorbar_opts = dict(title="Avg Salary", tickprefix="$")
fig.update_layout(margin=dict(t=50, l=25, r=25, b=25), coloraxis_colorbar=colorbar_opts)
fig.show()
In [38]:
# Release the in-memory SQLite database now that all queries are done.
conn.close()